import urllib
import traceback
import config
import xmlrpclib
import datetime
import dateutil.parser
import re
import os
import io
from bs4 import BeautifulSoup

def initDrupal():
  """Creates an XML-RPC connection to the BulkPub module in Drupal,
  using the endpoint URL configured in config.drupalURL.
  Returns: drupal proxy object
  """
  return xmlrpclib.ServerProxy(config.drupalURL, allow_none=True)

def getBody(soup):
  """Extracts the article content from an HTML document (the
  'entry-content' div) and unwraps table/script markup throughout the
  document - the tags disappear but their inner content is kept.
  Input is a BeautifulSoup object.
  Returns: BeautifulSoup object
  """
  content = soup.find('div', class_='entry-content')
  # Unwrap the layout tags everywhere; children are promoted into the
  # parent, so only the tags themselves are removed.
  unwanted = ('table', 'tbody', 'tr', 'td', 'script')
  for tagName in unwanted:
    for element in soup(tagName):
      element.unwrap()
  return content

def renameImages(soup, images, drupal):
  """Rewrites all <img> tags in the BeautifulSoup (soup) object for new URL's.  'images' is a dict containing image names
  and URL's, generated by the uploadImage().  The 'src' attribute in each img tag is analysed to find the
  image filename.  The corresponding URL is then extracted from images{} and written into the <img>.  If the image does not
  exist in 'images', then the file is downloaded, uploaded to Drupal, and added to 'images' (which is mutated in place).
  returns the BeautifulSoup object.
  """
  for original in soup("img"):
    if 'sites.google.com' in original['src']:
      # Image hosted on Google Sites: it was already uploaded, so just
      # look up its new URL by filename.
      filePath = original['src']
      fileName = urllib.unquote( re.search(u'(.*\/)(.*)', filePath).group(2) ).decode()
      # Google Sites does some sinful things, like jamming parameters onto the end of filenames... 
      # so let's clean that up:
      fileName = re.sub(u'\?.*', '', fileName)
      #print fileName
      # NOTE(review): raises KeyError if the filename was never uploaded -
      # presumably findAllImages() guarantees coverage; confirm.
      newPath = images[fileName]
      original['src'] = newPath
    #Skip google images - adsense, +1, etc
    elif ('google' not in original['src']) and ('addthis' not in original['src']):
      #Download the image, add to images{}
      tmpName = original['src'].split('/')[-1]
      tmpName = re.sub(u'\?.*', '', tmpName)
      #YUCK!  deal with specific case of horrid image URL
      if tmpName == 'v3imagegallery-i.php':
        tmpName = 'v3imagegallery-i.jpeg'
      tmpPath = os.path.join('/tmp', tmpName)
      # urlretrieve returns (local_filename, headers); split the local
      # path back into directory + name for the images{} bookkeeping.
      externalImage = urllib.urlretrieve( original['src'], tmpPath )
      imagePath = '/'.join( externalImage[0].split('/')[:-1] ) 
      imageName = externalImage[0].split('/')[-1]
      images[imageName] = imagePath
      newImageURL = uploadImage(drupal, imageName, images)
      original['src'] = newImageURL       
    #Remove the <A HREF> tags from the images
    if original.parent.name == 'a':
      original.parent.unwrap()

  return soup

      
def rewriteLinks(soup):
  """Accepts a BeautifulSoup object and rewrites every <a> tag pointing
  at "sites.google.com/site/urbanastronomer" into a site-relative link
  (query strings are stripped).  Anchors with no href are removed.
  Returns relative-ised soup object.
  """
  for anchor in soup('a'):
    href = anchor.attrs.get('href')
    if href is None:
      # Anchor with no destination: drop it (and its contents) entirely.
      anchor.decompose()
    elif 'sites.google.com/site/urbanastronomer' in href:
      # Keep everything below the site root, then strip any query string.
      relative = '/' + '/'.join(href.split('/')[5:])
      anchor.attrs['href'] = relative.split('?')[0]
  return soup
  

def findAllImages(walk):
  """Identifies images in a directory tree (an iterable of os.walk-style
  (dirpath, dirnames, filenames) tuples).
  Returns: a dict of their imageName: Path
  """
  # We're cheating here: in this specific migration, every file is either
  # an image or index.html, so anything non-index is treated as an image.
  return {
    name: dirpath
    for dirpath, dirnames, filenames in walk
    for name in filenames
    if name != "index.html"
  }

def findAllHTML(walk):
  """Identifies articles, defined as directories containing an index.html,
  in an iterable of os.walk-style (dirpath, dirnames, filenames) tuples.
  Returns: dict of articleName (last path component) -> directory path
  """
  articles = {}
  for dirpath, dirnames, filenames in walk:
    if 'index.html' not in filenames:
      continue
    # The article title is the name of its containing directory.
    articles[dirpath.split('/')[-1]] = dirpath
  return articles

def uploadImage(proxy, imageName, imageList):
  """Uploads an image (named in imageName), from the path recorded in
  imageList, to the Drupal installation defined in config.
  Returns: site-relative URL of the stored image as a string, or None on
  any error (missing file, unknown type, open failure, RPC failure).
  """
  image = os.path.join(imageList[imageName], imageName)
  # MIME-type labels for the file extensions we know how to upload
  ftypes = {".png":"image/png", ".jpg":"image/jpeg", ".jpeg":"image/jpeg", ".gif":"image/gif", ".tif":"image/tiff", ".tiff":"image/tiff", ".svg":"image/svg+xml", ".bmp":"image/bmp"}
  # is the file there?
  if not os.path.exists(image):
    print "Error!",image,"does not exist"
    return None
  # get the (lower-cased) file extension and map it to a MIME type
  spext = os.path.splitext(imageName)[1].lower()
  if spext not in ftypes:
    # BUG FIX: this message previously referenced an undefined name 'f',
    # which raised a NameError instead of printing the warning.
    print "Error!",imageName,spext,"unknown image type"
    return None
  ftp = ftypes[spext]

  # read the raw image bytes; always close the handle (the original
  # leaked the file descriptor)
  try:
    fd = io.FileIO(image, mode='rb')
    try:
      rawbits = fd.read()
    finally:
      fd.close()
  except (IOError, OSError):
    print "Error!",image,"could not be opened"
    return None

  # data to send over XMLRPC; xmlrpclib.Binary base64-encodes the bytes
  data = {'name': imageName, 'type': ftp, 'bits': xmlrpclib.Binary(rawbits)}

  # upload the file to Drupal
  try:
    ret = proxy.bulkpub.newImage(ftp, config.user, config.password, data)
  except Exception:
    print "newImage failed!"
    traceback.print_exc()
    return None

  # return the URL where the image was stored,
  # converted from an absolute to a site-relative path
  absURL = ret["url"]
  relURL = re.sub(r'http://[\w\.]*/', '/', absURL)
  return relURL

def createPage(soup, title, date, path):
  """Creates page data from HTML document presented as a BeautifulSoup object
  Returns a structure containing the article plus metadata, 
  ready for injection into Drupal via uploadHTML()
  """
  testpage = {}
  testpage['title'] = title 
  keywords = path.split('/')[1:-1]
  print "Keywords:   ", keywords
  testpage['keywords'] = keywords
  testalias = '/'.join( path.split('/')[1:] )
  #print testalias
  testpage['alias'] = testalias
  testpage['format'] = "full_html"
  testpage['content'] = unicode(soup)
  testpage['date'] = date
  return testpage

def uploadHTML(proxy, pagedata):
  """Takes a generated page (from createPage()) and uploads it to Drupal.
  pagedata['content'] holds the serialised HTML; title/keywords/date/alias/
  format supply the metadata.  Vocabulary terms are registered first, then
  the page is created and immediately re-edited to back-date it.
  Returns the new node identifier from bulkpub.newPage, or None on failure.
  """
  title = pagedata['title']
  page  = pagedata['content']
  terms = pagedata['keywords']
  # Parse the free-form date string scraped from the source HTML.
  date = dateutil.parser.parse(pagedata['date'])
  print "Terms:  ", terms
  vocabulary = 'tags'

  # use the appropriate method
  meth = proxy.bulkpub.newPage

  # add any terms to the vocabulary.
  # they have to be added to the vocabulary before they
  # are attached to a content instance.
  for term in terms:
    print "adding term",term,"to",vocabulary 
    addVocabularyTerm(proxy, vocabulary, term)

  # create the data to send over RPC
  #  title
  data = {"title":title}
  #  page content(text)
  data["description"] = page
  #  topic tags and chosen vocabulary
  if len(terms)>0:
      data['terms'] = terms
      data['vocabulary'] = vocabulary

  #  set alias
  if 'alias' in pagedata:
      data['alias'] = pagedata['alias']
  #  set text format
  if 'format' in pagedata:
      data['format'] = pagedata['format']
 
  # upload the page to Drupal
  try:
     ret=meth("article", config.user, config.password, data, True)
     # newPage cannot set the creation date, so fetch the freshly created
     # node via the blogger API, patch dateCreated, and save it again.
     tmpNode = proxy.blogger.getPost(True, ret, config.user, config.password)
     tmpNode['dateCreated'] = date
     print date, pagedata['date']
     print tmpNode['link']
     print pagedata['alias']
     proxy.blogger.editPost(True, ret, config.user, config.password, tmpNode, True)
  
  except:
    ret=None
    traceback.print_exc()
    print("newPage failed")
  
  return ret


def addVocabularyTerm(proxy, vocab, term):

    try:
       ret = proxy.bulkpub.addVocabularyTerm(config.user, config.password, vocab, term)

    except:
       ret=None
       print("addVocabularyTerm error!",vocab,term)

    return ret




def main():
  """Migration driver: walks config.targetPath, uploads every image found
  to Drupal, then converts each article's index.html (rewriting image
  sources and internal links) and uploads it as a Drupal page.
  """
  drupal = initDrupal() 
  # os.walk yields a one-shot generator; materialise it so the tree can
  # be scanned twice (once for images, once for articles).
  walk = os.walk(config.targetPath)
  dirTree = []
  for page in walk:
    dirTree.append(page)
  if config.test:
    print "Config loaded"
  imageList = findAllImages(dirTree)
  imageURLs = {}
  for image in imageList:
    print "image ", image, " uploading...   "
    # Filenames on disk are percent-encoded; decode them so they match
    # the names referenced from within the HTML.
    imageDecoded = urllib.unquote(image).decode()
    imageURLs[imageDecoded] = uploadImage(drupal, imageDecoded, imageList)
    print "   ...done"
  articleDict = findAllHTML(dirTree)

  for title in articleDict:
    path = articleDict[title]
    file = open(os.path.join(path, 'index.html'))
    print "Processing ", title, " from path: ", path
    soup = BeautifulSoup(file)
    # NOTE(review): assumes the first <abbr> tag's title attribute holds
    # the publication date - confirm against the exported Google Sites HTML.
    date = soup.abbr['title']
    fullTitle = soup.title.text
    cleanBody = renameImages(getBody(soup), imageURLs, drupal)
    body = rewriteLinks(cleanBody)
    #print body.prettify()
    page = createPage(body, fullTitle, date, path)
    
    print uploadHTML(drupal, page)
    
    #exit()
    
    

  

if __name__ == "__main__":
  main()
